Data importation

Import the data sets extracted from the UCPH_Data Preparation R Markdown.

list.files()
## [1] "endpoint.txt"           "plant_info.txt"         "S_timeseries.txt"      
## [4] "T_timeseries.txt"       "testtemplate"           "timeseries.txt"        
## [7] "UCPH_DataAnalysis.html" "UCPH_DataAnalysis.Rmd"
plant_info <- read.table("plant_info.txt", header = TRUE, sep = "\t")
endpoint <- read.table("endpoint.txt", header = TRUE, sep = "\t")
S_timeseries <- read.table("S_timeseries.txt", header = TRUE, sep = "\t")

We must convert the columns to factor and date formats.

# plant_info
plant_info <- lapply(plant_info, factor)

# endpoint
matching_cols <- intersect(names(endpoint), names(plant_info))
endpoint[, matching_cols] <- lapply(endpoint[, matching_cols], factor)
endpoint$Date <- date(endpoint$Date)
endpoint$Timestamp <- NA

# timeseries
# No data for UCPH

# S_timeseries
matching_cols <- intersect(names(S_timeseries), names(plant_info))
S_timeseries[, matching_cols] <- lapply(S_timeseries[, matching_cols], factor)
S_timeseries$Timestamp <- as.POSIXct(S_timeseries$Timestamp, format = "%Y-%m-%d %H:%M:%S")
S_timeseries$Date <- date(S_timeseries$Date)

# T_timeseries
# No data for UCPH

1. Endpoint dataframe

A. Exploration of data

List of variables

This part extracts the variables in the endpoint dataframe.

df <- endpoint[,colSums(is.na(endpoint))<nrow(endpoint)]
genotype_index <- which(colnames(df) == "Genotype")

variables <- colnames(df[, c(3:(genotype_index - 1))]) # We remove the three first columns that are "Unit.ID","Time" and "Date"
variables
## [1] "DW_shoot_g" "DW_root_g"

The variables for NaPPI are “DW_shoot_g” and “FW_shoot_g”

Exploration tables using the janitor and skimr packages

get_summary_stats(data = endpoint, 
                  variables[1], variables[2],
                  type = "common")
## # A tibble: 2 × 10
##   variable       n   min   max median   iqr  mean    sd    se    ci
##   <fct>      <dbl> <dbl> <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 DW_shoot_g   107 0.768     8   3.22  2.00 3.46  1.65   0.16 0.317
## 2 DW_root_g    108 0.1       5   0.8   0.7  0.982 0.832  0.08 0.159
endpoint %>% 
  count(Genotype)
##   Genotype  n
## 1   EPPN_T 12
## 2  EPPN1_H 12
## 3  EPPN1_L 12
## 4  EPPN2_H 12
## 5  EPPN2_L 12
## 6  EPPN3_H 12
## 7  EPPN3_L 12
## 8  EPPN4_H 12
## 9  EPPN4_L 12
skim(endpoint[, unlist(variables)])
Data summary
Name endpoint[, unlist(variabl…
Number of rows 108
Number of columns 2
_______________________
Column type frequency:
numeric 2
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
DW_shoot_g 1 0.99 3.46 1.65 0.77 2.3 3.22 4.3 8 ▅▇▅▂▁
DW_root_g 0 1.00 0.98 0.83 0.10 0.5 0.80 1.2 5 ▇▂▁▁▁

Boxplots, density histograms and qqPlots

## Boxplots

# We add a variable PlantType
endpoint$Plant_type <- substr(endpoint$Genotype, nchar(as.character(endpoint$Genotype)), nchar(as.character(endpoint$Genotype)))

boxplot1 <- ggplot(endpoint, aes(y = endpoint[ , colnames(endpoint)==variables[1]]))+
  geom_boxplot(aes(x = Genotype, fill = Plant_type))+
  labs(title = variables[1], y = variables[1])

boxplot2 <- ggplot(endpoint, aes(y = endpoint[ , colnames(endpoint)==variables[2]]))+
  geom_boxplot(aes(x = Genotype, fill = Plant_type))+
  labs(title = variables[2], y = variables[2])

combined <- boxplot1 + boxplot2 & theme(legend.position = "bottom")
combined + plot_layout(guides = "collect")
## Warning: Removed 1 rows containing non-finite values (`stat_boxplot()`).

B. Normality hypothesis and outlier detection

Remove the outliers, replacing them with NULL values and normality verification of residuals.

# Create a clean dataset that will contain the data without the outlyers
endpoint_clean <- endpoint


## Outlayer detection function
isoutlayer <- function(independent_variable, genotype, proba = 0.05) {
  data <- data.frame(genotype, independent_variable)
  data <- na.omit(data)
  
  data$genotype <- as.factor(data$genotype)
  
  
  # Linear model fit
  fit <- lm(independent_variable ~ genotype, data)
  
  # Density histogram of residuals - Normality test
  density_histogram <- ggplot(data = data.frame(x = fit$residuals), aes(x = x)) +
    geom_histogram(aes(y = ..density..), bins = 20, color = 4, fill = 4, alpha = 0.5) +
    labs(title = "Density histogram of residuals", x = "Residuals", y = "Density")+
    geom_density(color = "blue", size = 0.7, linetype = "dashed")+
    stat_function(fun = dnorm, args = list(mean = mean(fit$residuals), sd = sd(fit$residuals)), color = "red", size = 0.7)
    
  # Boxplot of residuals
  boxplot <- ggplot(data = data.frame(x = fit$residuals, genotype = data$genotype), aes(x = genotype, y = x, fill = genotype))+
    geom_boxplot() +
    labs(title = "Boxplot of residuals", y = deparse(substitute(independent_variable)))
  
  # qqPlot of residuals
  qqplot <- qqPlot(fit$residuals)
  
  # Show graphs
  grid.arrange(density_histogram, boxplot, ncol = 2)
  print(qqplot)
  
  # Return logical vector for the outlayers values
  return(abs(fit$residuals) > 2.0 * summary(fit)$sigma) 
}

# Run the function on the dataset for all the variables
endpoint_clean[[variables[1]]][isoutlayer(endpoint[[variables[1]]], endpoint$Genotype)]
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

##  31 101 
##  31 100
## [1] 7.804054 5.242991 5.769168 1.924528
endpoint_clean[[variables[2]]][isoutlayer(endpoint[[variables[2]]], endpoint$Genotype)]

## [1] 55 77
## [1] 4.3 5.0 3.5 4.0

Violin and sina plots after outlier detection

ATTENTION ICI CHANGER LES NOMS DES VARIABLES

# Violin and sina plots

violin1 <- ggplot(endpoint_clean, aes(y = DW_shoot_g, x = Genotype))+
  geom_violin(aes(fill = Plant_type), color = "white", alpha = 0.4)+
  geom_sina(size=1, aes(color = Plant_type))+
  theme(legend.position = "none")+
  labs(title = variables[1])+
  theme(axis.text.x = element_text(angle = 45, hjust = 1),  # Rotation des étiquettes des dates
        panel.grid.major.x = element_line(color = "lightgray", size = 0.5),  # Paramètres de la grille
        panel.grid.minor.x = element_blank())  # Supprimer les lignes de grille mineures
## Warning: The `size` argument of `element_line()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
violin2 <- ggplot(endpoint_clean, aes(y = DW_root_g, x = Genotype))+
  geom_violin(aes(fill = Plant_type), color = "white", alpha = 0.4)+
  geom_sina(size=1, aes(color = Plant_type))+
  theme(legend.position = "none")+
  labs(title = variables[2])+
  theme(axis.text.x = element_text(angle = 45, hjust = 1),  # Rotation des étiquettes des dates
        panel.grid.major.x = element_line(color = "lightgray", size = 0.5),  # Paramètres de la grille
        panel.grid.minor.x = element_blank())  # Supprimer les lignes de grille mineures

combined <- violin1 + violin2 & theme(legend.position = "bottom")
combined + plot_layout(guides = "collect")
## Warning: Removed 1 rows containing non-finite values (`stat_ydensity()`).
## Warning: Removed 1 rows containing non-finite values (`stat_sina()`).

Exploration statistics for the variables after outlier detection

ATTENTION ICI CHANGER LES NOMS DES VARIABLES

skim(endpoint_clean[, unlist(variables)])
Data summary
Name endpoint_clean[, unlist(v…
Number of rows 108
Number of columns 2
_______________________
Column type frequency:
numeric 2
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
DW_shoot_g 1 0.99 3.46 1.65 0.77 2.3 3.22 4.3 8 ▅▇▅▂▁
DW_root_g 0 1.00 0.98 0.83 0.10 0.5 0.80 1.2 5 ▇▂▁▁▁
# For "DW_shoot_g"
endpoint_clean %>% 
  group_by(Genotype) %>% 
  summarize(mean    = mean(DW_shoot_g, na.rm=TRUE),
            std.dev = sd(DW_shoot_g, na.rm=TRUE),
            n_missing  = sum(is.na(DW_shoot_g))) %>% 
  arrange(desc(mean)) %>% # sort
  print(n=Inf) # print full table
## # A tibble: 9 × 4
##   Genotype  mean std.dev n_missing
##   <fct>    <dbl>   <dbl>     <int>
## 1 EPPN4_L   3.95    1.56         1
## 2 EPPN1_H   3.92    1.81         0
## 3 EPPN2_L   3.71    1.87         0
## 4 EPPN1_L   3.48    1.66         0
## 5 EPPN3_H   3.45    2.16         0
## 6 EPPN3_L   3.38    1.10         0
## 7 EPPN4_H   3.31    2.03         0
## 8 EPPN_T    3.15    1.43         0
## 9 EPPN2_H   2.81    1.18         0
# For "DW_root_g"
endpoint_clean %>% 
  group_by(Genotype) %>% 
  summarize(mean    = mean(DW_root_g, na.rm=TRUE),
            std.dev = sd(DW_root_g, na.rm=TRUE),
            n_missing  = sum(is.na(DW_root_g))) %>% 
  arrange(desc(mean)) %>% # sort
  print(n=Inf) # print full table
## # A tibble: 9 × 4
##   Genotype  mean std.dev n_missing
##   <fct>    <dbl>   <dbl>     <int>
## 1 EPPN1_H  1.49    1.34          0
## 2 EPPN_T   1.07    1.29          0
## 3 EPPN3_H  1.05    1.07          0
## 4 EPPN2_L  0.983   0.595         0
## 5 EPPN1_L  0.975   0.567         0
## 6 EPPN4_L  0.9     0.533         0
## 7 EPPN3_L  0.892   0.378         0
## 8 EPPN4_H  0.817   0.629         0
## 9 EPPN2_H  0.667   0.438         0

C. Statistical models for phenotypic traits

La variable explicative(X) sera le génotype, variable catégorielle. Les réponses(Y) sont les données phénotypiques (dans ce cas-ci la FW_shoot_g et la Measured_plant_height_cm)

unique(endpoint$Genotype)
## [1] EPPN2_H EPPN_T  EPPN1_H EPPN1_L EPPN3_L EPPN4_H EPPN2_L EPPN3_H EPPN4_L
## 9 Levels: EPPN_T EPPN1_H EPPN1_L EPPN2_H EPPN2_L EPPN3_H EPPN3_L ... EPPN4_L
variables
## [1] "DW_shoot_g" "DW_root_g"

ATTENTION ICI CHANGER LES VARIABLES ### 1. First linear models Firstly, we model the Y = X + r + c + e Where - Y is the phenotypic trait; - X the genotype; - r the row effect (fixed or random); - c the column effect (fixed or random);

Models for DW_shoot_g and FW_shoot_g with fixed or random effects of Row and Column.

# Linear model of "DW_shoot_g" with Rows and Columns as fixed effects
mod1 <- lm(DW_shoot_g ~ Genotype + Row + Column, endpoint_clean)
summary(mod1)
## 
## Call:
## lm(formula = DW_shoot_g ~ Genotype + Row + Column, data = endpoint_clean)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.1929 -0.9016 -0.1632  0.9502  3.1679 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)   
## (Intercept)      2.22535    0.83657   2.660  0.00946 **
## GenotypeEPPN1_H  0.64568    0.66386   0.973  0.33372   
## GenotypeEPPN1_L  0.34784    0.66861   0.520  0.60435   
## GenotypeEPPN2_H -0.40655    0.66618  -0.610  0.54344   
## GenotypeEPPN2_L  0.63701    0.66634   0.956  0.34200   
## GenotypeEPPN3_H  0.21468    0.66620   0.322  0.74812   
## GenotypeEPPN3_L  0.37039    0.66623   0.556  0.57982   
## GenotypeEPPN4_H  0.16653    0.66873   0.249  0.80399   
## GenotypeEPPN4_L  0.79085    0.68435   1.156  0.25131   
## Row2             0.54325    0.76386   0.711  0.47906   
## Row3             0.09322    0.79172   0.118  0.90657   
## Row4             0.21972    0.76386   0.288  0.77437   
## Row5             0.77570    0.76386   1.015  0.31297   
## Row6             0.39310    0.76386   0.515  0.60826   
## Row7             0.88715    0.76386   1.161  0.24897   
## Row8            -0.30547    0.76386  -0.400  0.69031   
## Row9            -0.82113    0.76386  -1.075  0.28566   
## Row10            0.55524    0.76386   0.727  0.46944   
## Row11           -0.23882    0.76386  -0.313  0.75537   
## Row12            0.38861    0.76386   0.509  0.61234   
## Column2          0.03488    0.66870   0.052  0.95853   
## Column3          1.68226    0.66623   2.525  0.01357 * 
## Column4          0.55525    0.66859   0.830  0.40877   
## Column5          1.34309    0.68180   1.970  0.05235 . 
## Column6          0.57524    0.66621   0.863  0.39051   
## Column7          1.33187    0.66874   1.992  0.04987 * 
## Column8         -0.22411    0.66385  -0.338  0.73656   
## Column9          1.24077    0.66623   1.862  0.06627 . 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.62 on 79 degrees of freedom
##   (1 observation effacée parce que manquante)
## Multiple R-squared:  0.2838, Adjusted R-squared:  0.03899 
## F-statistic: 1.159 on 27 and 79 DF,  p-value: 0.3003
anova(mod1)
## Analysis of Variance Table
## 
## Response: DW_shoot_g
##           Df  Sum Sq Mean Sq F value Pr(>F)  
## Genotype   8  12.508  1.5635  0.5955 0.7789  
## Row       11  24.134  2.1940  0.8356 0.6053  
## Column     8  45.544  5.6930  2.1682 0.0388 *
## Residuals 79 207.427  2.6257                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Linear model of "DW_shoot_g" with Rows and Columns as random effects
mod2 <- lmer(DW_shoot_g ~ Genotype + (1|Row) + (1|Column), endpoint_clean)
## boundary (singular) fit: see help('isSingular')
summary(mod2)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: DW_shoot_g ~ Genotype + (1 | Row) + (1 | Column)
##    Data: endpoint_clean
## 
## REML criterion at convergence: 399.3
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -1.8027 -0.7516 -0.1208  0.6252  2.4692 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  Row      (Intercept) 0.0000   0.0000  
##  Column   (Intercept) 0.2687   0.5184  
##  Residual             2.5706   1.6033  
## Number of obs: 107, groups:  Row, 12; Column, 9
## 
## Fixed effects:
##                 Estimate Std. Error      df t value Pr(>|t|)    
## (Intercept)       3.1533     0.4957 76.6159   6.361 1.33e-08 ***
## GenotypeEPPN1_H   0.7040     0.6558 90.5805   1.073    0.286    
## GenotypeEPPN1_L   0.3415     0.6584 91.6671   0.519    0.605    
## GenotypeEPPN2_H  -0.3729     0.6571 91.1193  -0.568    0.572    
## GenotypeEPPN2_L   0.6035     0.6571 91.1568   0.918    0.361    
## GenotypeEPPN3_H   0.2566     0.6571 91.1237   0.391    0.697    
## GenotypeEPPN3_L   0.3077     0.6571 91.1321   0.468    0.641    
## GenotypeEPPN4_H   0.1642     0.6584 91.6951   0.249    0.804    
## GenotypeEPPN4_L   0.8041     0.6726 91.4143   1.196    0.235    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) GEPPN1_H GEPPN1_L GEPPN2_H GEPPN2_L GEPPN3_H GEPPN3_L
## GntyEPPN1_H -0.661                                                      
## GntyEPPN1_L -0.664  0.498                                               
## GntyEPPN2_H -0.663  0.499    0.501                                      
## GntyEPPN2_L -0.663  0.497    0.499    0.500                             
## GntyEPPN3_H -0.663  0.501    0.501    0.500    0.498                    
## GntyEPPN3_L -0.663  0.497    0.501    0.500    0.502    0.498           
## GntyEPPN4_H -0.664  0.498    0.504    0.499    0.501    0.499    0.501  
## GntyEPPN4_L -0.649  0.490    0.492    0.488    0.488    0.491    0.488  
##             GEPPN4_H
## GntyEPPN1_H         
## GntyEPPN1_L         
## GntyEPPN2_H         
## GntyEPPN2_L         
## GntyEPPN3_H         
## GntyEPPN3_L         
## GntyEPPN4_H         
## GntyEPPN4_L  0.492  
## optimizer (nloptwrap) convergence code: 0 (OK)
## boundary (singular) fit: see help('isSingular')
ranova(mod2)
## boundary (singular) fit: see help('isSingular')
## ANOVA-like table for random-effects: Single term deletions
## 
## Model:
## DW_shoot_g ~ Genotype + (1 | Row) + (1 | Column)
##              npar  logLik    AIC    LRT Df Pr(>Chisq)  
## <none>         12 -199.65 423.31                       
## (1 | Row)      11 -199.65 421.31 0.0000  1    1.00000  
## (1 | Column)   11 -201.13 424.25 2.9479  1    0.08599 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Linear model of "DW_root_g" with Rows and Columns as random effects
mod3 <- lm(DW_root_g ~ Genotype + Row + Column, endpoint_clean)
summary(mod3)
## 
## Call:
## lm(formula = DW_root_g ~ Genotype + Row + Column, data = endpoint_clean)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.4165 -0.4383 -0.1346  0.3884  3.1027 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)  
## (Intercept)      0.53168    0.42684   1.246   0.2165  
## GenotypeEPPN1_H  0.37533    0.33889   1.108   0.2714  
## GenotypeEPPN1_L -0.08119    0.34131  -0.238   0.8126  
## GenotypeEPPN2_H -0.38209    0.34007  -1.124   0.2646  
## GenotypeEPPN2_L -0.06935    0.34010  -0.204   0.8389  
## GenotypeEPPN3_H -0.03992    0.34008  -0.117   0.9069  
## GenotypeEPPN3_L -0.15029    0.34010  -0.442   0.6598  
## GenotypeEPPN4_H -0.27947    0.34133  -0.819   0.4154  
## GenotypeEPPN4_L -0.20697    0.34130  -0.606   0.5460  
## Row2             0.31111    0.38994   0.798   0.4273  
## Row3             0.16667    0.38994   0.427   0.6702  
## Row4            -0.05556    0.38994  -0.142   0.8871  
## Row5             0.46667    0.38994   1.197   0.2349  
## Row6             0.14444    0.38994   0.370   0.7120  
## Row7             0.64444    0.38994   1.653   0.1023  
## Row8             0.01111    0.38994   0.028   0.9773  
## Row9            -0.05556    0.38994  -0.142   0.8871  
## Row10            0.41111    0.38994   1.054   0.2949  
## Row11           -0.25556    0.38994  -0.655   0.5141  
## Row12            0.26667    0.38994   0.684   0.4960  
## Column2          0.05549    0.34130   0.163   0.8713  
## Column3          0.55875    0.34010   1.643   0.1043  
## Column4          0.33024    0.34130   0.968   0.3362  
## Column5          0.72117    0.34010   2.120   0.0371 *
## Column6          0.59243    0.34009   1.742   0.0854 .
## Column7          0.72249    0.34131   2.117   0.0374 *
## Column8          0.12649    0.33888   0.373   0.7099  
## Column9          0.24179    0.34010   0.711   0.4792  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8272 on 80 degrees of freedom
## Multiple R-squared:  0.2605, Adjusted R-squared:  0.01086 
## F-statistic: 1.044 on 27 and 80 DF,  p-value: 0.4257
anova(mod3)
## Analysis of Variance Table
## 
## Response: DW_root_g
##           Df Sum Sq Mean Sq F value Pr(>F)
## Genotype   8  4.959 0.61988  0.9060 0.5159
## Row       11  6.643 0.60393  0.8826 0.5605
## Column     8  7.676 0.95949  1.4023 0.2084
## Residuals 80 54.738 0.68423
# Linear model of "DW_root_g" with Rows and Columns as random effects
mod4 <- lmer(DW_root_g ~ Genotype + (1|Row) + (1|Column), endpoint_clean)
## boundary (singular) fit: see help('isSingular')
summary(mod4)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: DW_root_g ~ Genotype + (1 | Row) + (1 | Column)
##    Data: endpoint_clean
## 
## REML criterion at convergence: 267.1
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -1.5625 -0.5655 -0.2342  0.3342  4.6592 
## 
## Random effects:
##  Groups   Name        Variance  Std.Dev. 
##  Row      (Intercept) 1.704e-15 4.128e-08
##  Column   (Intercept) 2.436e-02 1.561e-01
##  Residual             6.743e-01 8.211e-01
## Number of obs: 108, groups:  Row, 12; Column, 9
## 
## Fixed effects:
##                 Estimate Std. Error       df t value Pr(>|t|)    
## (Intercept)      1.06919    0.24317 90.91275   4.397 2.97e-05 ***
## GenotypeEPPN1_H  0.41021    0.33558 91.94626   1.222    0.225    
## GenotypeEPPN1_L -0.08852    0.33629 93.57954  -0.263    0.793    
## GenotypeEPPN2_H -0.39473    0.33593 92.77403  -1.175    0.243    
## GenotypeEPPN2_L -0.07926    0.33593 92.78438  -0.236    0.814    
## GenotypeEPPN3_H -0.02353    0.33593 92.77748  -0.070    0.944    
## GenotypeEPPN3_L -0.16777    0.33593 92.78441  -0.499    0.619    
## GenotypeEPPN4_H -0.25885    0.33629 93.58608  -0.770    0.443    
## GenotypeEPPN4_L -0.17861    0.33629 93.57631  -0.531    0.597    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) GEPPN1_H GEPPN1_L GEPPN2_H GEPPN2_L GEPPN3_H GEPPN3_L
## GntyEPPN1_H -0.690                                                      
## GntyEPPN1_L -0.691  0.499                                               
## GntyEPPN2_H -0.691  0.499    0.501                                      
## GntyEPPN2_L -0.691  0.498    0.499    0.500                             
## GntyEPPN3_H -0.691  0.501    0.501    0.500    0.499                    
## GntyEPPN3_L -0.691  0.498    0.501    0.500    0.501    0.499           
## GntyEPPN4_H -0.691  0.499    0.502    0.499    0.501    0.499    0.501  
## GntyEPPN4_L -0.691  0.500    0.501    0.499    0.501    0.501    0.499  
##             GEPPN4_H
## GntyEPPN1_H         
## GntyEPPN1_L         
## GntyEPPN2_H         
## GntyEPPN2_L         
## GntyEPPN3_H         
## GntyEPPN3_L         
## GntyEPPN4_H         
## GntyEPPN4_L  0.502  
## optimizer (nloptwrap) convergence code: 0 (OK)
## boundary (singular) fit: see help('isSingular')
anova(mod4)
## Type III Analysis of Variance Table with Satterthwaite's method
##          Sum Sq Mean Sq NumDF  DenDF F value Pr(>F)
## Genotype 4.7479 0.59349     8 92.974  0.8802 0.5362
ranova(mod4)
## boundary (singular) fit: see help('isSingular')
## ANOVA-like table for random-effects: Single term deletions
## 
## Model:
## DW_root_g ~ Genotype + (1 | Row) + (1 | Column)
##              npar  logLik    AIC     LRT Df Pr(>Chisq)
## <none>         12 -133.57 291.14                      
## (1 | Row)      11 -133.57 289.14 0.00000  1     1.0000
## (1 | Column)   11 -133.83 289.66 0.51949  1     0.4711
# Linear model of Soil?
#mod3 <- lm(Soil ~ Genotype + (1|Row) + (1|Column), endpoint_clean)
#summary(mod3)
#anova(mod3)

2. Linear models with Plant_type

Model with X as Plant_type instead of Genotype, and row and column effects as random effects. Plant_type is defined as H for Hybrid, L for pure Line and T for Tester.

# Linear model of DW_shoot_g with Rows and Columns as random effects
endpoint_clean$Plant_type <- as.factor(endpoint_clean$Plant_type)
# Rajouter colonne avec le "numéro" de génotype

endpoint_clean$Plant_type <- relevel(endpoint_clean$Plant_type, ref = "T") # T as base level
mod <- lmer(DW_shoot_g ~ Plant_type + (1|Row) + (1|Column), endpoint_clean)
## boundary (singular) fit: see help('isSingular')
summary(mod)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: DW_shoot_g ~ Plant_type + (1 | Row) + (1 | Column)
##    Data: endpoint_clean
## 
## REML criterion at convergence: 407.4
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -1.7872 -0.7322 -0.1474  0.5911  2.5756 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  Row      (Intercept) 0.0000   0.000   
##  Column   (Intercept) 0.2883   0.537   
##  Residual             2.4959   1.580   
## Number of obs: 107, groups:  Row, 12; Column, 9
## 
## Fixed effects:
##             Estimate Std. Error      df t value Pr(>|t|)    
## (Intercept)   3.1516     0.4917 76.9543   6.410 1.06e-08 ***
## Plant_typeH   0.1873     0.5118 96.9742   0.366    0.715    
## Plant_typeL   0.5131     0.5136 97.3840   0.999    0.320    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) Plnt_H
## Plant_typeH -0.833       
## Plant_typeL -0.832  0.798
## optimizer (nloptwrap) convergence code: 0 (OK)
## boundary (singular) fit: see help('isSingular')

3. Linear models with asreml library

modasreml <- asreml(fixed = DW_shoot_g ~ Genotype,
                    random = ~ Row + Column,
                    residual = ~  NULL,
                    data = endpoint_clean)
## ASReml Version 4.2 24/05/2024 09:10:35
##           LogLik        Sigma2     DF     wall
##  1     -110.8407      2.459644     98   09:10:35
##  2     -109.7897      2.537136     98   09:10:35  (  1 restrained)
##  3     -109.6061      2.567283     98   09:10:35  (  1 restrained)
##  4     -109.5973      2.570370     98   09:10:35  (  1 restrained)
##  5     -109.5968      2.570553     98   09:10:35  (  1 restrained)
plot(modasreml)

summary(modasreml)$varcomp
##            component std.error  z.ratio bound %ch
## Column  2.687275e-01 0.2464866 1.090232     P   0
## Row     8.472656e-07        NA       NA     B  NA
## units!R 2.570553e+00 0.4046865 6.351961     P   0

PROBLEME DANS CE BLOC

4. Linear models with Soil variable

Model with Soil as explicative variable.

PROBLEME DANS CE BLOC

ANALYSE DES DONNEES MULTIVARIEES

PROBLEME DANS CE BLOC PCA, clustering, etc, voir p.56 biométrie 1

2. Exploration of the timeseries data

In this part, we look at the timeseries, S_timeseries and T_timeseries datasets.

Number of data observations per day for the traits of the timeseries datasets

REPLACER DANS LE CODE APRES

h1 <- ggplot(timeseries, aes(x = Date)) + geom_bar(aes(fill = Genotype), position = “stack”, width = 1) + scale_fill_viridis_d(option = “D”) + labs(x = “Date”, y = “Number of observations”, title = “Observations per day for timeseries_shoot_and_plant”) + scale_y_continuous(breaks = seq(from = 0, to = 325, by = 25)) + scale_x_date(date_breaks = “2 days”, date_labels = “%d-%m-%Y”) + # Exemple de format de date (%d-%m-%Y) theme(axis.text.x = element_text(angle = 45, hjust = 1), # Rotation des étiquettes des dates panel.grid.major.x = element_line(color = “lightgray”, size = 0.5), # Paramètres de la grille panel.grid.minor.x = element_blank()) # Supprimer les lignes de grille mineures

h3 <- ggplot(T_timeseries, aes(x = Date)) + geom_bar(aes(fill = Genotype), position = “stack”, width = 1) + scale_fill_viridis_d(option = “D”) + labs(x = “Date”, y = “Number of observations”, title = “Observations per day for T_timeseries_shoot”) + scale_y_continuous(breaks = seq(from = 0, to = 325, by = 25)) + scale_x_date(date_breaks = “2 days”, date_labels = “%d-%m-%Y”) + # Exemple de format de date (%d-%m-%Y) theme(axis.text.x = element_text(angle = 45, hjust = 1), # Rotation des étiquettes des dates panel.grid.major.x = element_line(color = “lightgray”, size = 0.5), # Paramètres de la grille panel.grid.minor.x = element_blank()) # Supprimer les lignes de grille mineures

combined <- h1 + h2 + h3 & theme(legend.position = “top”) combined + plot_layout(guides = “collect”)

h2 <- ggplot(S_timeseries, aes(x = Date)) +
  geom_bar(aes(fill = Genotype), position = "stack", width = 0.96) +
  scale_fill_viridis_d(option = "D") +
  labs(x = "Date", y = "Number of observations", title = "Observations per day for S_timeseries") +
  scale_y_continuous(breaks = seq(from = 0, to = 325, by = 25)) +
  scale_x_date(date_breaks = "2 days", date_labels = "%d-%m-%Y") +  # Exemple de format de date (%d-%m-%Y)
  theme(axis.text.x = element_text(angle = 45, hjust = 1),  # Rotation des étiquettes des dates
        panel.grid.major.x = element_line(color = "lightgray", size = 0.5),  # Paramètres de la grille
        panel.grid.minor.x = element_blank())  # Supprimer les lignes de grille mineures

h2

A. Exploration of the timeseries dataframe

Variables

Firsty, we extract the variables of the timeseries dataframe.

B. Exploration of the S_timeseries dataframe

Variables

variables_1 <- "S_Height_cm"

Time point object

1. endpoint

Raw data

timePoint_endpoint <- createTimePoints(dat = endpoint,
                                      experimentName = "EPPN2020_UCPH",
                                      genotype = "Genotype",
                                      timePoint = "Date",
                                      plotId = "Unit.ID",
                                      rowNum = "Row",
                                      colNum = "Column")

summary(timePoint_endpoint)
## timePoint_endpoint contains data for experiment EPPN2020_UCPH.
## 
## It contains 1 time points.
## First time point: 2021-06-28 
## Last time point: 2021-06-28 
## 
## No check genotypes are defined.
getTimePoints(timePoint_endpoint)
##   timeNumber  timePoint
## 1          1 2021-06-28

Count the number of observations per trait.

traits <- variables

for (trait_name in traits) {
  print(paste("How many observations for", trait_name))
  num_observations <- countValid(timePoint_endpoint, trait_name)
  print(num_observations)
}
## [1] "How many observations for DW_shoot_g"
## 2021-06-28 
##        107 
## [1] "How many observations for DW_root_g"
## 2021-06-28 
##        108

Check the layout at the only timepoint.

getTimePoints(timePoint_endpoint)
##   timeNumber  timePoint
## 1          1 2021-06-28
num_timepoints <- getTimePoints(timePoint_endpoint)[1,1]

plot(timePoint_endpoint,
     plotType = "layout",
     timePoints = num_timepoints,
     highlight =  c("EPPN_T", "EPPN1_L", "EPPN1_H", "EPPN2_L", "EPPN2_H", "EPPN3_L", "EPPN3_H", "EPPN4_L", "EPPN4_H"))

Check the heatmap of the raw data at harvest.

for (trait_name in traits) {
  plot(timePoint_endpoint,
     plotType = "layout",
     timePoints = num_timepoints,
     traits = trait_name)
}

After outlier detection

timePoint_endpoint_clean <- createTimePoints(dat = endpoint_clean,
                                      experimentName = "EPPN2020_NaPPI",
                                      genotype = "Genotype",
                                      timePoint = "Date",
                                      plotId = "Unit.ID",
                                      rowNum = "Row",
                                      colNum = "Column")

Count the number of observations per trait

for (trait_name in traits) {
  print(paste("How many observations for", trait_name))
  num_observations <- countValid(timePoint_endpoint_clean, trait_name)
  print(num_observations)
}
## [1] "How many observations for DW_shoot_g"
## 2021-06-28 
##        107 
## [1] "How many observations for DW_root_g"
## 2021-06-28 
##        108

Check the heatmap of the data at harvest.

for (trait_name in traits) {
  plot(timePoint_endpoint_clean,
     plotType = "layout",
     timePoints = num_timepoints,
     traits = trait_name)
}

2. S_timeseries

For the createTimePoints we need one timepoint per Unit.ID per day. We can average the variables per day.

# Agréger les données
S_timeseries <- S_timeseries %>%
  group_by(Unit.ID, Date) %>%
  mutate(S_Height_cm_avg = mean(S_Height_cm)) %>%
  distinct(Unit.ID, Date, .keep_all = TRUE) %>%
  ungroup()
# We add a Plant_type variable
S_timeseries$Plant_type <- substr(S_timeseries$Genotype, nchar(as.character(S_timeseries$Genotype)), nchar(as.character(S_timeseries$Genotype)))

S_timeseries$Plant_type <- ifelse(S_timeseries$Plant_type %in% c("T", "L"), "Line",
                                  ifelse(S_timeseries$Plant_type == "H", "Hybrid", S_timeseries$Plant_type))


timePoint_S <- createTimePoints(dat = S_timeseries,
                              experimentName = "EPPN2020_NaPPI",
                              genotype = "Genotype",
                              timePoint = "Date",
                              plotId = "Unit.ID",
                              rowNum = "Row",
                              colNum = "Column")

summary(timePoint_S)
## timePoint_S contains data for experiment EPPN2020_NaPPI.
## 
## It contains 17 time points.
## First time point: 2021-06-13 
## Last time point: 2021-06-29 
## 
## No check genotypes are defined.
getTimePoints(timePoint_S)
##    timeNumber  timePoint
## 1           1 2021-06-13
## 2           2 2021-06-14
## 3           3 2021-06-15
## 4           4 2021-06-16
## 5           5 2021-06-17
## 6           6 2021-06-18
## 7           7 2021-06-19
## 8           8 2021-06-20
## 9           9 2021-06-21
## 10         10 2021-06-22
## 11         11 2021-06-23
## 12         12 2021-06-24
## 13         13 2021-06-25
## 14         14 2021-06-26
## 15         15 2021-06-27
## 16         16 2021-06-28
## 17         17 2021-06-29

Variables and number of observations

We choose the variables that we want to see. Count the number of observations per variable.

traits <- "S_Height_cm"

trait_name <- traits[1]

for (trait in traits) {
  print(paste("How many observations for", trait))
  valid_count <- countValid(timePoint_S, trait)
  print(valid_count)
}
## [1] "How many observations for S_Height_cm"
## 2021-06-13 2021-06-14 2021-06-15 2021-06-16 2021-06-17 2021-06-18 2021-06-19 
##        108        108        108        108        108        108        108 
## 2021-06-20 2021-06-21 2021-06-22 2021-06-23 2021-06-24 2021-06-25 2021-06-26 
##        108        108        108        108        108        108        108 
## 2021-06-27 2021-06-28 2021-06-29 
##         66        108         84

Genotypic layout

Check the genotypic layout at every timepoint.

getTimePoints(timePoint_S)
##    timeNumber  timePoint
## 1           1 2021-06-13
## 2           2 2021-06-14
## 3           3 2021-06-15
## 4           4 2021-06-16
## 5           5 2021-06-17
## 6           6 2021-06-18
## 7           7 2021-06-19
## 8           8 2021-06-20
## 9           9 2021-06-21
## 10         10 2021-06-22
## 11         11 2021-06-23
## 12         12 2021-06-24
## 13         13 2021-06-25
## 14         14 2021-06-26
## 15         15 2021-06-27
## 16         16 2021-06-28
## 17         17 2021-06-29
num_timepoints <- getTimePoints(timePoint_S)

for (i in 1:length(num_timepoints$timeNumber)) {
  plot(timePoint_S,
       plotType = "layout",
       timePoints = i,
       highlight = c("EPPN_T", "EPPN1_L", "EPPN1_H", "EPPN2_L", "EPPN2_H", "EPPN3_L", "EPPN3_H", "EPPN4_L", "EPPN4_H"))
}

Raw data visualisation

Heatmap of raw data

Check the heatmap of the raw data at all the time points

for (trait in traits) {
  for (tp in 1:length(num_timepoints$timeNumber)) {
    plot(timePoint_S,
         plotType = "layout",
         timePoints = tp,
         traits = trait)
  }
}

Time course of raw data

Check some time courses of raw data

for (trait in traits) {
  plot(timePoint_S, 
     traits = trait,
     plotType = "raw")
}

Boxplots of raw data

for (trait in traits) {
  plot(timePoint_S,
     plotType = "box",
     traits = trait)
}

Correlation plots of raw data

for (trait in traits) {
  plot(timePoint_S,
     plotType = "cor",
     traits = trait)
}

Outliers detection

For first trait

Using the SingleOut detect and single functions. We select a subset of plants to adjust the settings for the confIntSize and nnLocfit.

plantSel<- c(1,2,3,4,5,6,7,8,9,10)

cutoffA <- 1 
ci <- c(3)[cutoffA] # confidence interval
nn <- c(0.6)[cutoffA] # nearest neighbour (0 - 1 greater number smoother curve)
ce <- c(FALSE)[cutoffA]

For all the traits

for (trait_name in traits) {
  variable_name <- paste0("Single_test_", trait_name)
  
  single_test <- detectSingleOut(
    TP = timePoint_S,
    trait = trait_name,
    plotIds = plantSel,
    confIntSize = ci,
    nnLocfit = nn,
    checkEdges = TRUE
  )
  
  assign(variable_name, single_test)
  
  plot(single_test, outOnly = FALSE)
}

We can then run on all plants.

for (trait_name in traits) {
  single_test_object_name <- paste0("Single_test_", trait_name)
  Single_test <- get(single_test_object_name)
  
  # Vérifier s'il y a des valeurs aberrantes
  if (any(Single_test$outlier == 1)) {
    # Compter le nombre de valeurs aberrantes par date
    outliers_count <- with(Single_test[Single_test$outlier == 1,], table(timePoint))
    print(trait_name) 
    print(outliers_count)

    # Supprimer les valeurs aberrantes uniquement si elles existent
    Single_outliers <- removeSingleOut(timePoint_S, Single_test)
    assign(paste0("Single_outliers_", trait_name), Single_outliers)

    # Écrire les résultats dans un fichier TSV
    readr::write_tsv(Single_test, sprintf("%s/single_outliers_%s.tsv", datadir, trait_name))
  } else {
    cat("No outlier for", trait_name, "\n")
  }
}
## [1] "S_Height_cm"
## timePoint
## 2021-06-15 2021-06-17 2021-06-22 2021-06-24 2021-06-27 2021-06-29 
##          1          1          2          1          2          1

Data visualisation after outliers removal

Heatmap of data

Check the heatmap of the data with outliers detection at all the time points.

for (trait_name in traits) {
  single_outliers_name <- paste0("Single_outliers_", trait_name)
  
  if (exists(single_outliers_name)) {
    Single_outliers <- get(single_outliers_name)
    
    for (tp in 1:length(num_timepoints$timeNumber)) {
      plot(Single_outliers,
           plotType = "layout",
           timePoints = tp,
           traits = trait_name)
    }
  } else {
    cat("Aucun objet Single_outliers trouvé pour le trait", trait_name, "\n")
  }
}

Time course, boxplots and correlation plots of data

for (trait_name in traits) {
  single_outliers_name <- paste0("Single_outliers_", trait_name)
  
  if (exists(single_outliers_name)) {
    Single_outliers <- get(single_outliers_name)
    
    plot(Single_outliers, 
         traits = trait_name,
         plotType = "raw")
    
    plot(Single_outliers,
         plotType = "box",
         traits = trait_name)
    
    plot(Single_outliers,
         plotType = "cor",
         traits = trait_name)
    
  } else {
    cat("No Single_outliers object found for trait", trait_name, "\n")
  }
}

Fit a model

Fit a model for all time points with no extra fixed effects.

for (trait_name in traits) {
  # Construire le nom de l'objet Single_outliers pour le trait actuel
  single_outliers_name <- paste0("Single_outliers_", trait_name)
  
  # Vérifier s'il existe un objet Single_outliers pour ce trait
  if (exists(single_outliers_name)) {
    # Récupérer l'objet Single_outliers correspondant
    Single_outliers <- get(single_outliers_name)
    
    # Créer le modèle modTP pour le trait actuel
    assign(paste0("modTP_", trait_name), 
           fitModels(TP = Single_outliers,
                     trait = trait_name,
                     geno.decomp = "Plant_type"))
  } else {
    # Si aucun objet Single_outliers n'est trouvé, utiliser timePoint_S comme TP et le nom du trait
    assign(paste0("modTP_", trait_name), 
           fitModels(TP = timePoint_S,
                     trait = trait_name,
                     geno.decomp = "Plant_type"))
  }
}
## 2021-06-13
## 2021-06-14
## 2021-06-15
## 2021-06-16
## 2021-06-17
## 2021-06-18
## 2021-06-19
## 2021-06-20
## 2021-06-21
## 2021-06-22
## 2021-06-23
## 2021-06-24
## 2021-06-25
## 2021-06-26
## 2021-06-27
## 2021-06-28
## 2021-06-29

Model visualisation

for (trait_name in traits) {
  mod_name <- paste0("modTP_", trait_name)
  
  if (exists(mod_name)) {
    mod <- get(mod_name)
    
    for (tp in 1:length(num_timepoints$timeNumber)) {
      plot(mod,
           timePoints = tp, 
           plotType = "spatial",
           spaTrend = "percentage")
    }
    
    gif_file <- sprintf("%s/%s_mod.gif", datadir, trait_name)
    
    plot(mod,
         plotType = "timeLapse",
         spaTrend = "percentage",
         outFile = gif_file)
  } else {
    cat("Aucun modèle trouvé pour le trait", trait_name, "\n")
  }
}

## Output at: C:/Users/elise/Documents/Mémoire/Template/UCPH/UCPH_Template/testtemplate/S_Height_cm_mod.gif

for (trait_name in traits) {
  mod_name <- paste0("modTP_", trait_name)
  
  if (exists(mod_name)) {
    mod <- get(mod_name)
    
    plot(mod,
         plotType = "rawPred",
         plotLine = TRUE)
    
    plot(mod,
         plotType = "corrPred",
         plotLine = TRUE)
    
    plot(mod, 
         plotType = "herit",
         yLim = c(0, 0.5))
    
    plot(mod,
         plotType = "effDim",
         EDType = "ratio",
         yLim = c(0, 1))
    
    # Tracer le graphique pour les valeurs corrigées spatialement
    plot(mod, 
         plotType = "corrPred")
  } else {
    cat("Aucun modèle trouvé pour le trait", trait_name, "\n")
  }
}

Use the splines

trait_name <- "S_Height_cm"

modTP <- fitModels(TP = Single_outliers_S_Height_cm,
                        trait = trait_name,
                        geno.decomp = "Plant_type")
## 2021-06-13
## 2021-06-14
## 2021-06-15
## 2021-06-16
## 2021-06-17
## 2021-06-18
## 2021-06-19
## 2021-06-20
## 2021-06-21
## 2021-06-22
## 2021-06-23
## 2021-06-24
## 2021-06-25
## 2021-06-26
## 2021-06-27
## 2021-06-28
## 2021-06-29
Spatial_Corrected <- getCorrected(modTP)


#Fit the spline for every plant
knots <- c(30)
mintimepoints <- c(9)

fit.spline <- fitSpline(inDat = Spatial_Corrected, 
                             trait = paste0(trait_name, "_corr"),
                             knots = knots,
                             minNoTP = mintimepoints)

# Extracting the tables of predicted values and P-spline coefficients
predDat <- fit.spline$predDat
coefDat <- fit.spline$coefDat

For a plant selection

Time series outliers

thrCor <- c(0.9) # correlation threshold
thrPca <- c(30) # pca angle threshold
thrSlope <- c(0.7) # slope threshold

Series_test <- detectSerieOut(corrDat = Spatial_Corrected,
                           predDat = predDat,
                           coefDat = coefDat,
                           trait = paste0(trait_name, "_corr"),
                           thrCor = thrCor,
                           thrPca = thrPca,
                           thrSlope = thrSlope,
                           geno.decomp = "geno.decomp")

plot(Series_test, genotypes = levels(factor(Series_test$genotype)))

# Spatial_Corrected_Out <- Spatial_Corrected

Spatial_Corrected_Out <- removeSerieOut(dat = Spatial_Corrected,
                                        serieOut = Series_test)

readr::write_tsv(Spatial_Corrected_Out, sprintf( "%s/timeSeriesOutliers_%s.tsv", datadir, trait_name))

With the cleaned data